Replication of "FRED-MD: A Monthly Database for Macroeconomic Research"¶
Libraries¶
import pandas as pd
import numpy as np
import requests
import io
import certifi
from scipy import linalg
from numpy.linalg import svd
import matplotlib.pyplot as plt
from IPython.display import display, HTML, Image
import re
import statsmodels.api as sm
from itertools import product
import time
Data¶
Collecting Data¶
The dataset used in this project was originally downloaded from the FRED-MD Monthly Databases for Macroeconomic Research, available on the Federal Reserve Bank of St. Louis website here.
To ensure reproducibility and consistent access to the data, a copy of the CSV file has been uploaded to a public GitHub repository.
The corresponding file on the FRED-MD website is named 2025-05, reflecting the dataset version used in this project.
# Stable copy of the FRED-MD 2025-05 vintage, mirrored on GitHub for reproducibility
URL = ('https://raw.githubusercontent.com/tdistudent27/fred-md-data/refs/heads/main/2025-05.csv')
response = requests.get(URL, timeout=30, verify=certifi.where())
response.raise_for_status() # raises an exception if the HTTP status is not 200
df = pd.read_csv(io.BytesIO(response.content))
print(df)
sasdate RPI W875RX1 DPCERA3M086SBEA CMRMTSPLx \
0 Transform: 5.000 5.0 5.000 5.000000e+00
1 1/1/1959 2583.560 2426.0 15.188 2.766768e+05
2 2/1/1959 2593.596 2434.8 15.346 2.787140e+05
3 3/1/1959 2610.396 2452.7 15.491 2.777753e+05
4 4/1/1959 2627.446 2470.0 15.435 2.833627e+05
.. ... ... ... ... ...
791 11/1/2024 20091.169 16376.8 122.396 1.545040e+06
792 12/1/2024 20101.629 16387.7 123.077 1.558008e+06
793 1/1/2025 20148.969 16391.2 122.614 1.543178e+06
794 2/1/2025 20209.351 16389.5 122.742 1.556553e+06
795 3/1/2025 20311.260 16500.4 123.601 NaN
RETAILx INDPRO IPFPNSS IPFINAL IPCONGD ... \
0 5.00000 5.0000 5.0000 5.0000 5.0000 ...
1 17689.23968 21.9616 23.3868 22.2620 31.6664 ...
2 17819.01912 22.3917 23.7024 22.4549 31.8987 ...
3 17967.91336 22.7142 23.8459 22.5651 31.8987 ...
4 17978.97983 23.1981 24.1903 22.8957 32.4019 ...
.. ... ... ... ... ... ...
791 712145.00000 101.9619 99.3808 98.8609 100.8691 ...
792 717662.00000 103.1177 100.4976 99.9719 101.6868 ...
793 711461.00000 103.3418 101.0766 100.6319 102.1879 ...
794 711680.00000 104.2202 101.8233 101.4377 102.7245 ...
795 722025.00000 103.8892 101.6665 101.1465 101.7332 ...
DNDGRG3M086SBEA DSERRG3M086SBEA CES0600000008 CES2000000008 \
0 6.000 6.000 6.00 6.00
1 18.294 10.152 2.13 2.45
2 18.302 10.167 2.14 2.46
3 18.289 10.185 2.15 2.45
4 18.300 10.221 2.16 2.47
.. ... ... ... ...
791 119.230 129.380 31.59 36.26
792 119.746 129.875 31.72 36.43
793 120.457 130.281 31.91 36.56
794 120.615 130.990 32.00 36.66
795 119.760 131.192 32.21 36.79
CES3000000008 UMCSENTx DTCOLNVHFNM DTCTHFNM INVEST VIXCLSx
0 6.00 2.0 6.00 6.00 6.0000 1.0000
1 2.04 NaN 6476.00 12298.00 84.2043 NaN
2 2.05 NaN 6476.00 12298.00 83.5280 NaN
3 2.07 NaN 6508.00 12349.00 81.6405 NaN
4 2.08 NaN 6620.00 12484.00 81.8099 NaN
.. ... ... ... ... ... ...
791 28.22 71.8 556011.41 938335.20 5381.4576 15.9822
792 28.33 74.0 559364.75 943484.76 5366.6686 15.6997
793 28.58 71.7 559087.09 944167.06 5350.2541 16.8122
794 28.68 64.7 556142.06 941199.49 5367.9408 17.0705
795 28.92 57.0 NaN NaN 5406.5887 21.6579
[796 rows x 127 columns]
Extracting TCODE variables¶
# Extract the first row, which contains the transformation codes (TCODEs).
# NOTE: the original cell performed this exact extraction twice in a row,
# the second pass overwriting the first; the duplicate has been removed —
# the resulting `tcode_df` is unchanged.
tcode_row = df.iloc[0]
# Drop the 'sasdate' column since it's not an economic variable
tcode_row = tcode_row.drop('sasdate')
# Create a one-row DataFrame with the variable names as columns
# and the index labeled 'TCODE'
tcode_df = pd.DataFrame([tcode_row.values], columns=tcode_row.index, index=['TCODE'])
print(tcode_df)
RPI W875RX1 DPCERA3M086SBEA CMRMTSPLx RETAILx INDPRO IPFPNSS \
TCODE 5.0 5.0 5.0 5.0 5.0 5.0 5.0
IPFINAL IPCONGD IPDCONGD ... DNDGRG3M086SBEA DSERRG3M086SBEA \
TCODE 5.0 5.0 5.0 ... 6.0 6.0
CES0600000008 CES2000000008 CES3000000008 UMCSENTx DTCOLNVHFNM \
TCODE 6.0 6.0 6.0 2.0 6.0
DTCTHFNM INVEST VIXCLSx
TCODE 6.0 6.0 1.0
[1 rows x 126 columns]
Adjusting the dataset¶
# Strip the TCODE row, relabel the date column, and coerce types.
df = (
    df.iloc[1:]               # drop row 0, which holds the transformation codes
      .reset_index(drop=True)
      .rename(columns={'sasdate': 'Date'})
)
# Parse the dates (stored as M/D/YYYY strings) into proper datetimes
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
# Every non-Date column becomes numeric; unparsable entries become NaN
value_cols = df.columns.difference(['Date'])
df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')
# Check
print(df)
Date RPI W875RX1 DPCERA3M086SBEA CMRMTSPLx \
0 1959-01-01 2583.560 2426.0 15.188 2.766768e+05
1 1959-02-01 2593.596 2434.8 15.346 2.787140e+05
2 1959-03-01 2610.396 2452.7 15.491 2.777753e+05
3 1959-04-01 2627.446 2470.0 15.435 2.833627e+05
4 1959-05-01 2642.720 2486.4 15.622 2.853072e+05
.. ... ... ... ... ...
790 2024-11-01 20091.169 16376.8 122.396 1.545040e+06
791 2024-12-01 20101.629 16387.7 123.077 1.558008e+06
792 2025-01-01 20148.969 16391.2 122.614 1.543178e+06
793 2025-02-01 20209.351 16389.5 122.742 1.556553e+06
794 2025-03-01 20311.260 16500.4 123.601 NaN
RETAILx INDPRO IPFPNSS IPFINAL IPCONGD ... \
0 17689.23968 21.9616 23.3868 22.2620 31.6664 ...
1 17819.01912 22.3917 23.7024 22.4549 31.8987 ...
2 17967.91336 22.7142 23.8459 22.5651 31.8987 ...
3 17978.97983 23.1981 24.1903 22.8957 32.4019 ...
4 18119.82573 23.5476 24.3911 23.1161 32.5567 ...
.. ... ... ... ... ... ...
790 712145.00000 101.9619 99.3808 98.8609 100.8691 ...
791 717662.00000 103.1177 100.4976 99.9719 101.6868 ...
792 711461.00000 103.3418 101.0766 100.6319 102.1879 ...
793 711680.00000 104.2202 101.8233 101.4377 102.7245 ...
794 722025.00000 103.8892 101.6665 101.1465 101.7332 ...
DNDGRG3M086SBEA DSERRG3M086SBEA CES0600000008 CES2000000008 \
0 18.294 10.152 2.13 2.45
1 18.302 10.167 2.14 2.46
2 18.289 10.185 2.15 2.45
3 18.300 10.221 2.16 2.47
4 18.280 10.238 2.17 2.48
.. ... ... ... ...
790 119.230 129.380 31.59 36.26
791 119.746 129.875 31.72 36.43
792 120.457 130.281 31.91 36.56
793 120.615 130.990 32.00 36.66
794 119.760 131.192 32.21 36.79
CES3000000008 UMCSENTx DTCOLNVHFNM DTCTHFNM INVEST VIXCLSx
0 2.04 NaN 6476.00 12298.00 84.2043 NaN
1 2.05 NaN 6476.00 12298.00 83.5280 NaN
2 2.07 NaN 6508.00 12349.00 81.6405 NaN
3 2.08 NaN 6620.00 12484.00 81.8099 NaN
4 2.08 95.3 6753.00 12646.00 80.7315 NaN
.. ... ... ... ... ... ...
790 28.22 71.8 556011.41 938335.20 5381.4576 15.9822
791 28.33 74.0 559364.75 943484.76 5366.6686 15.6997
792 28.58 71.7 559087.09 944167.06 5350.2541 16.8122
793 28.68 64.7 556142.06 941199.49 5367.9408 17.0705
794 28.92 57.0 NaN NaN 5406.5887 21.6579
[795 rows x 127 columns]
Applying the correct TCODE to each variable¶
To apply the appropriate transformation to each variable, I built a function that reads the transformation code (TCODE) associated with each series and applies the corresponding operation.
These codes are provided in the first row of the original dataset and were stored separately for clarity.
The function loops through all columns (except the Date), looks up the TCODE for each one, and applies the correct transformation using a dictionary that maps each code to a specific operation. This approach ensures that each variable is treated consistently and according to the definitions used in the FRED-MD dataset.
The column TCODE denotes the following data transformation for a series $x$:
$\text{no transformation}$;
$\Delta x_t$;
$\Delta^2 x_t$;
$log(x_t)$;
$\Delta log(x_t)$;
$\Delta^2 log(x_t)$;
$\Delta(x_t / x_{t-1} - 1.0)$.
def _level(x):
    """(1) No transformation."""
    return x

def _diff(x):
    """(2) First difference Δx."""
    return x.diff()

def _diff2(x):
    """(3) Second difference Δ²x."""
    return x.diff().diff()

def _log(x):
    """(4) Natural logarithm log(x)."""
    return np.log(x)

def _log_diff(x):
    """(5) Log first difference Δlog(x)."""
    return np.log(x).diff()

def _log_diff2(x):
    """(6) Log second difference Δ²log(x)."""
    return np.log(x).diff().diff()

def _pct_diff(x):
    """(7) Difference of the growth rate Δ(x/x₋₁ − 1)."""
    return (x / x.shift(1) - 1).diff()

# Map each FRED-MD TCODE to the transformation it denotes
tcode_functions = {
    1: _level,
    2: _diff,
    3: _diff2,
    4: _log,
    5: _log_diff,
    6: _log_diff2,
    7: _pct_diff,
}
def apply_transformations(df, tcode_df):
    """Apply to each series the transformation indicated by its TCODE.

    Parameters
    ----------
    df : DataFrame with a 'Date' column plus one column per series.
    tcode_df : Series mapping each column name to its transformation code.

    Returns a transformed copy of ``df``; 'Date' is left untouched, and a
    warning is printed for any code without a registered transformation.
    """
    out = df.copy()
    for name in df.columns:
        if name == 'Date':
            continue
        code = int(tcode_df[name])
        transform = tcode_functions.get(code)
        if transform is None:
            print(f"Warning: No transformation defined for TCODE {code} in column {name}")
        else:
            out[name] = transform(df[name])
    return out
# Apply the transformations
df_transformed = apply_transformations(df, tcode_df.loc['TCODE'])
# Now remove the first 2 rows: second differences (TCODEs 3 and 6) leave
# NaNs in the first two observations, so the balanced panel starts at month 3
df_transformed = df_transformed.iloc[2:].reset_index(drop=True)
Another important step is to check for ±inf values (and replace them with NaN if present), since they may be the result of taking log(0).
# Count per column how many entries are +inf and how many are -inf
# (infinities could arise from taking log of a non-positive value)
pos_inf_count = df_transformed.eq(np.inf).sum()
neg_inf_count = df_transformed.eq(-np.inf).sum()
# Display only columns where +inf or -inf values are present
print("Columns with +inf values:\n", pos_inf_count[pos_inf_count > 0])
print("\nColumns with -inf values:\n", neg_inf_count[neg_inf_count > 0])
Columns with +inf values: Series([], dtype: int64) Columns with -inf values: Series([], dtype: int64)
So there are no inf values
# Use the observation date as the index so time-series operations align on it
df_transformed.set_index('Date', inplace=True)
print(df_transformed)
RPI W875RX1 DPCERA3M086SBEA CMRMTSPLx RETAILx \
Date
1959-03-01 0.006457 0.007325 0.009404 -0.003374 0.008321
1959-04-01 0.006510 0.007029 -0.003622 0.019915 0.000616
1959-05-01 0.005796 0.006618 0.012043 0.006839 0.007803
1959-06-01 0.003068 0.003012 0.003642 -0.000097 0.009064
1959-07-01 -0.000580 -0.000762 -0.003386 0.012155 -0.000330
... ... ... ... ... ...
2024-11-01 0.001798 0.002507 0.004463 0.004270 0.006384
2024-12-01 0.000520 0.000665 0.005548 0.008358 0.007717
2025-01-01 0.002352 0.000214 -0.003769 -0.009564 -0.008678
2025-02-01 0.002992 -0.000104 0.001043 0.008630 0.000308
2025-03-01 0.005030 0.006744 0.006974 NaN 0.014431
INDPRO IPFPNSS IPFINAL IPCONGD IPDCONGD ... \
Date ...
1959-03-01 0.014300 0.006036 0.004896 0.000000 0.019397 ...
1959-04-01 0.021080 0.014339 0.014545 0.015652 0.006379 ...
1959-05-01 0.014954 0.008267 0.009580 0.004766 0.020152 ...
1959-06-01 0.001137 0.007035 0.007125 -0.004766 0.007453 ...
1959-07-01 -0.024237 0.001168 0.008251 0.013056 0.019609 ...
... ... ... ... ... ... ...
2024-11-01 -0.002467 -0.002353 -0.002039 -0.006307 0.014193 ...
2024-12-01 0.011272 0.011175 0.011175 0.008074 -0.015573 ...
2025-01-01 0.002171 0.005745 0.006580 0.004916 -0.020199 ...
2025-02-01 0.008464 0.007360 0.007976 0.005237 0.043469 ...
2025-03-01 -0.003181 -0.001541 -0.002875 -0.009697 0.005102 ...
DNDGRG3M086SBEA DSERRG3M086SBEA CES0600000008 CES2000000008 \
Date
1959-03-01 -0.001148 0.000292 -0.000022 -0.008147
1959-04-01 0.001312 0.001760 -0.000022 0.012203
1959-05-01 -0.001695 -0.001867 -0.000021 -0.004090
1959-06-01 0.003334 0.001946 -0.004619 0.003992
1959-07-01 -0.001204 -0.000013 0.000000 -0.004040
... ... ... ... ...
2024-11-01 0.000117 -0.002130 -0.000639 -0.002760
2024-12-01 0.004218 0.002179 0.002206 0.004953
2025-01-01 0.001602 -0.000697 0.001865 -0.001115
2025-02-01 -0.004609 0.002306 -0.003156 -0.000831
2025-03-01 -0.008425 -0.003886 0.003725 0.000808
CES3000000008 UMCSENTx DTCOLNVHFNM DTCTHFNM INVEST VIXCLSx
Date
1959-03-01 0.004819 NaN 0.004929 0.004138 -0.014792 NaN
1959-04-01 -0.004890 NaN 0.012134 0.006734 0.024929 NaN
1959-05-01 -0.004819 NaN 0.002828 0.002020 -0.015342 NaN
1959-06-01 0.004796 NaN 0.009726 0.009007 -0.012252 NaN
1959-07-01 -0.004796 NaN -0.004631 -0.001000 0.029341 NaN
... ... ... ... ... ... ...
2024-11-01 0.003190 1.3 -0.000872 -0.001684 -0.011910 15.9822
2024-12-01 -0.001439 2.2 0.004047 0.004152 0.002154 15.6997
2025-01-01 0.004895 -2.3 -0.006509 -0.004750 -0.000311 16.8122
2025-02-01 -0.005293 -7.0 -0.004785 -0.003871 0.006364 17.0705
2025-03-01 0.004841 -7.7 NaN NaN 0.003874 21.6579
[793 rows x 126 columns]
Outliers¶
Now let's replace the outliers with NaN
An outlier is defined as an observation that deviates from the sample median by more than ten interquartile ranges.
# Robust location and scale for each series: median and interquartile range
center = df_transformed.median()
q1 = df_transformed.quantile(0.25)
q3 = df_transformed.quantile(0.75)
spread = q3 - q1
# An observation is an outlier when it sits more than ten IQRs from the median
outlier_mask = (df_transformed - center).abs() > (10 * spread)
# Blank the flagged observations out in place
df_transformed[outlier_mask] = np.nan
tot_outliers = outlier_mask.sum().sum()
print(f'Outliers found and set NaN:\n{tot_outliers}')
Outliers found and set NaN: 159
Iterative Expectation-Maximization algorithm¶
Estimation of the static factors by PCA adapted to allow for missing values. This is essentially the EM algorithm given in Stock and Watson (2002).
Observations that are missing are initialized to the unconditional mean based on the non-missing values (which is zero since the data are demeaned and standardized) so that the panel is re-balanced
Now let's check for NaN values
# Count remaining missing values per column (outliers were just set to NaN)
nan_count = df_transformed.isna().sum()
print("NaN values per column:\n", nan_count[nan_count > 0])
NaN values per column:
RPI 7
W875RX1 2
DPCERA3M086SBEA 3
CMRMTSPLx 2
RETAILx 2
...
CUSR0000SAS 1
UMCSENTx 227
DTCOLNVHFNM 9
DTCTHFNM 7
VIXCLSx 40
Length: 72, dtype: int64
In this section, I estimate the latent static factors from the transformed FRED-MD dataset using an Expectation-Maximization Principal Component Analysis (EM-PCA) procedure. This method follows the approach described in McCracken and Ng (2015).
The estimation technique is based on the algorithm proposed by Stock and Watson (2002), which allows principal component analysis to be performed in the presence of NaN data by iteratively imputing missing values and re-estimating factors until convergence.
Computing the mean for each variable¶
# Now let's compute the unconditional mean for each variable excluding the NaN values
# (these means initialize the missing entries before the EM-PCA iteration)
nan_mean = df_transformed.mean(skipna=True)
print(nan_mean)
RPI 0.002520
W875RX1 0.002575
DPCERA3M086SBEA 0.002776
CMRMTSPLx 0.002326
RETAILx 0.004653
...
UMCSENTx -0.047173
DTCOLNVHFNM -0.000008
DTCTHFNM 0.000005
INVEST 0.000019
VIXCLSx 19.293502
Length: 126, dtype: float64
Filling the NaN with the mean¶
# Re-balance the panel: initialize every missing entry with the unconditional
# mean of its own series (Stock–Watson EM initialization).
# `fillna` with a Series keyed by column name fills each column with its own
# value — equivalent to the previous per-column loop, in one vectorized call.
df_nan = df_transformed.fillna(nan_mean)
Standardizing the Data¶
Standardize each variable using:
$$ z_{it} = \frac{x_{it} - \mu_i}{\sigma_i} $$
This ensures that each variable has mean 0 and standard deviation 1.
# Column-wise std and mean of the mean-filled panel
# (pandas `std` uses ddof=1, matching the original per-column loop)
std = df_nan.std()
mean = df_nan.mean()
# Vectorized z-score: DataFrame/Series broadcasting subtracts each column's
# mean and divides by its std in one shot — identical to the former loop
df_standardized = (df_nan - mean) / std
print(df_standardized)
RPI W875RX1 DPCERA3M086SBEA CMRMTSPLx RETAILx \
Date
1959-03-01 0.743417 0.913692 1.177891 -0.490603 0.278756
1959-04-01 0.753564 0.856719 -1.136802 1.514077 -0.306780
1959-05-01 0.618746 0.777659 1.646691 0.388474 0.239408
1959-06-01 0.103557 0.083970 0.153936 -0.208534 0.335227
1959-07-01 -0.585383 -0.642070 -1.094952 0.846037 -0.378658
... ... ... ... ... ...
2024-11-01 -0.136278 -0.013218 0.299764 0.167340 0.131565
2024-12-01 -0.377567 -0.367449 0.492706 0.519265 0.232856
2025-01-01 -0.031653 -0.454366 -1.162997 -1.023462 -1.013013
2025-02-01 0.089212 -0.515402 -0.307847 0.542640 -0.330181
2025-03-01 0.474014 0.801900 0.746027 0.000000 0.743067
INDPRO IPFPNSS IPFINAL IPCONGD IPDCONGD ... \
Date ...
1959-03-01 1.432224 0.498939 0.314042 -0.168161 0.675422 ...
1959-04-01 2.229303 1.531037 1.396674 1.458580 0.160896 ...
1959-05-01 1.509062 0.776197 0.839660 0.327197 0.705275 ...
1959-06-01 -0.115156 0.623132 0.564228 -0.663519 0.203325 ...
1959-07-01 -3.098162 -0.106174 0.690570 1.188749 0.683799 ...
... ... ... ... ... ... ...
2024-11-01 -0.538955 -0.543769 -0.464048 -0.823675 0.469722 ...
2024-12-01 1.076242 1.137695 1.018633 0.670981 -0.706824 ...
2025-01-01 0.006333 0.462745 0.503050 0.342751 -0.889667 ...
2025-02-01 0.746155 0.663549 0.659610 0.376176 1.626893 ...
2025-03-01 -0.622839 -0.442876 -0.557812 -1.175995 0.110389 ...
DNDGRG3M086SBEA DSERRG3M086SBEA CES0600000008 CES2000000008 \
Date
1959-03-01 -0.185725 0.183861 -0.006168 -0.922779
1959-04-01 0.215603 1.106570 -0.006116 1.382487
1959-05-01 -0.274979 -1.173973 -0.006065 -0.463208
1959-06-01 0.545536 1.223616 -1.178946 0.452264
1959-07-01 -0.194877 -0.008207 -0.000597 -0.457593
... ... ... ... ...
2024-11-01 0.020714 -1.339505 -0.163703 -0.312603
2024-12-01 0.689762 1.370221 0.562088 0.561171
2025-01-01 0.262880 -0.438692 0.475253 -0.126262
2025-02-01 -0.750521 1.450354 -0.805624 -0.094019
2025-03-01 -1.373094 -2.444361 0.949592 0.091644
CES3000000008 UMCSENTx DTCOLNVHFNM DTCTHFNM INVEST \
Date
1959-03-01 1.023820 0.000000 2.605637e-01 3.884093e-01 -1.369033
1959-04-01 -1.040701 0.000000 6.408395e-01 6.323378e-01 2.302433
1959-05-01 -1.025764 0.000000 1.496799e-01 1.893801e-01 -1.419858
1959-06-01 1.019002 0.000000 5.137570e-01 8.458715e-01 -1.134212
1959-07-01 -1.020848 0.000000 -2.440059e-01 -9.439948e-02 2.710237
... ... ... ... ... ...
2024-11-01 0.677391 0.393806 -4.561051e-02 -1.587556e-01 -1.102657
2024-12-01 -0.306973 0.656894 2.140096e-01 3.896428e-01 0.197339
2025-01-01 1.040122 -0.658546 -3.431704e-01 -4.468299e-01 -0.030556
2025-02-01 -1.126504 -2.032450 -2.521521e-01 -3.642178e-01 0.586412
2025-03-01 1.028436 -2.237074 1.385908e-18 2.387832e-18 0.356265
VIXCLSx
Date
1959-03-01 0.000000
1959-04-01 0.000000
1959-05-01 0.000000
1959-06-01 0.000000
1959-07-01 0.000000
... ...
2024-11-01 -0.481744
2024-12-01 -0.522844
2025-01-01 -0.360992
2025-02-01 -0.323413
2025-03-01 0.343984
[793 rows x 126 columns]
# Sanity check: after mean-filling, no column should report missing values
nan_count_standardized = df_standardized.isna().sum()
print("NaN values per column:\n", nan_count_standardized[nan_count_standardized > 0])
NaN values per column: Series([], dtype: int64)
Factors¶
Starting from the data panel (with missing values initialized to zero), we estimate:
A matrix of factors:
$$ F = (f_1, \dots, f_T)' \in \mathbb{R}^{T \times r} $$A matrix of loadings:
$$ \lambda = (\lambda_1, \dots, \lambda_N)' \in \mathbb{R}^{N \times r} $$
These are estimated using the normalization: $$ \lambda' \lambda / N = I_r $$
Where:
- $T$ = number of time periods (rows),
- $N$ = number of series (columns),
- $r$ = number of factors.
The missing value for series i at time t is updated from zero to $\lambda'_i f_t$.
This is multiplied by the standard deviation of the series and the mean is re-added.
The resulting value is treated as an observation for series i at time t, and the mean and variance of the complete sample are re-calculated. The data are demeaned and standardized again, and the factors and loadings are re-estimated from the updated panel. The iteration stops when the factor estimates do not change.
$PC_p$ criteria developed in Bai and Ng (2002) is used, which is a generalization of Mallow’s $C_p$ criteria for large dimensional panels.
The penalty used is:
$$ \frac{N+T}{NT}\,\log\big(\min(N,T)\big) $$
# 1. PCA with normalization λ'λ / N = I_r
def pca_stock_watson(X_std: np.ndarray, r: int):
    """Stock–Watson principal components of a standardized panel.

    Parameters
    ----------
    X_std : ndarray (T × N) – columns already demeaned and standardized
    r : number of factors

    Returns
    -------
    F_hat : (T × r) – estimated factors
    Lambda_hat : (N × r) – estimated loadings
    X_hat : common component F̂ Λ̂' (T × N)
    sing_vals : singular values of X'X
    """
    _, N = X_std.shape
    # Eigen-decompose X'X (symmetric PSD, so SVD yields its eigenvectors)
    eigvecs, sing_vals, _ = svd(X_std.T @ X_std, full_matrices=False)
    # Scale so the loadings satisfy Λ'Λ / N = I_r
    Lambda_hat = np.sqrt(N) * eigvecs[:, :r]
    # Factors follow by cross-sectional projection: F = X Λ / N
    F_hat = (X_std @ Lambda_hat) / N
    return F_hat, Lambda_hat, F_hat @ Lambda_hat.T, sing_vals
# 2. Bai & Ng 2002 Criterion
def pc_p2_criterion(X_std: np.ndarray, kmax: int = 15) -> int:
    """Return r* ∈ {0, …, kmax} minimizing the PC_p2 information criterion."""
    T, N = X_std.shape
    NT = T * N
    # Per-factor penalty weight: ((N + T) / NT) · log(min(N, T))
    penalty_unit = ((N + T) / NT) * np.log(min(N, T))
    criteria = []
    for r in range(kmax + 1):
        # Fit with r factors; r = 0 means no common component at all
        X_hat = np.zeros_like(X_std) if r == 0 else pca_stock_watson(X_std, r)[2]
        sigma2 = np.sum((X_std - X_hat) ** 2) / NT
        criteria.append(np.log(sigma2) + penalty_unit * r)
    return int(np.argmin(criteria))
# 3. EM algorithm
def em_factors(
    df_transformed: pd.DataFrame,   # panel with the original NaNs still in place
    df_missing: pd.DataFrame,       # NaNs → unconditional mean (not standardized)
    df_standardized: pd.DataFrame,  # z-score (demeaned + std), no NaNs
    *,
    kmax: int = 15,
    tol: float = 1e-6,
    max_iter: int = 200,
    print_every: int = 10,
):
    """
    Python port of the MATLAB function `factors_em` (only DEMEAN=2).

    Iterates: (i) pick r* with the Bai–Ng PC_p2 criterion, (ii) extract
    factors/loadings by Stock–Watson PCA, (iii) replace the originally
    missing entries with their common-component fit, (iv) re-standardize —
    until the fitted panel stops changing (relative change < `tol`).

    NOTE(review): `df_missing` is accepted for interface compatibility but
    is never used in the body — initialization comes from `df_standardized`.

    Returns:
        factors_df, loadings_df, r_star,
        X_filled_unstd_df, X_filled_std_df, X_hat_df
    """
    # setup
    idx, cols = df_transformed.index, df_transformed.columns
    mask_nan = df_transformed.isna().to_numpy()   # True on original NaNs
    X_std = df_standardized.to_numpy(float)       # (T × N) working panel
    X_hat_prev = np.zeros_like(X_std)
    err, it = np.inf, 0
    # EM loop
    while err > tol and it < max_iter:
        it += 1
        # 1. select r* with PC_p2 (re-selected at every iteration)
        r_star = pc_p2_criterion(X_std, kmax)
        # 2. Stock-Watson PCA
        F_hat, Lambda_hat, X_hat, _ = pca_stock_watson(X_std, r_star)
        # 3. convergence criterion: relative change of the fitted panel
        if it > 1:
            err = np.sum((X_hat - X_hat_prev) ** 2) / np.sum(X_hat_prev ** 2)
        X_hat_prev = X_hat.copy()
        if it == 1 or it % print_every == 0:
            print(f"Iter {it:3d} | r*={r_star:2d} | err={err:9.2e}")
        # 4. BACK-TRANSFORM with CURRENT mean/std
        # (mean_curr ≈ 0 and std_curr ≈ 1 since X_std was just standardized;
        # presumably intended to mirror the MATLAB rescaling —
        # TODO(review): confirm against `factors_em.m`)
        mean_curr = X_std.mean(axis=0)
        std_curr = X_std.std(axis=0, ddof=0)
        std_curr[std_curr == 0] = 1.0            # keep constant columns at scale 1
        X_unstd = X_std * std_curr + mean_curr   # original scale
        updates = X_hat * std_curr + mean_curr   # predictions in original scale
        X_unstd[mask_nan] = updates[mask_nan]    # replace only NaNs
        # 5. re-standardize for next iteration
        mean_next = X_unstd.mean(axis=0)
        std_next = X_unstd.std(axis=0, ddof=0)
        std_next[std_next == 0] = 1.0
        X_std = (X_unstd - mean_next) / std_next
    # output: wrap the final arrays back into labeled DataFrames
    factors_df = pd.DataFrame(
        F_hat, index=idx, columns=[f"F{i+1}" for i in range(r_star)]
    )
    loadings_df = pd.DataFrame(
        Lambda_hat, index=cols, columns=[f"F{i+1}" for i in range(r_star)]
    )
    X_filled_unstd_df = pd.DataFrame(X_unstd, index=idx, columns=cols)
    X_filled_std_df = pd.DataFrame(X_std, index=idx, columns=cols)
    X_hat_df = pd.DataFrame(X_hat, index=idx, columns=cols)
    if it == max_iter:
        print(f" EM: maximum number of iterations reached ({max_iter}), err = {err:.2e}")
    else:
        print(f"Converged in {it} iterations (err = {err:.2e})")
    return (
        factors_df,
        loadings_df,
        r_star,
        X_filled_unstd_df,
        X_filled_std_df,
        X_hat_df,
    )
# Run the EM factor extraction on the prepared panels
factors, loadings, r_opt, X_f, X_f_std, xhat = em_factors(
    df_transformed = df_transformed,   # original NaNs
    df_missing     = df_nan,           # NaNs → unconditional mean (not used inside em_factors)
    df_standardized= df_standardized,  # z-score
    kmax           = 15,               # same as in the MATLAB script
    tol            = 1e-6,
    max_iter       = 200,
    print_every    = 10                # log every 10 iterations
)
Iter 1 | r*= 7 | err= inf Iter 10 | r*= 7 | err= 2.08e-04 Iter 20 | r*= 7 | err= 3.80e-05 Iter 30 | r*= 7 | err= 9.02e-06 Iter 40 | r*= 7 | err= 2.57e-06 Converged in 49 iterations (err = 9.09e-07)
# Summarize the estimation: chosen r* and a peek at factors and loadings
print("Optimal number of factors (r*) =", r_opt, "\n")
# First 5 rows of the factors (T × r)
print("=== FACTORS ===")
print(factors.head())
print("\n=== LOADINGS ===")
print(loadings.head())
Optimal number of factors (r*) = 7
=== FACTORS ===
F1 F2 F3 F4 F5 F6 \
Date
1959-03-01 -0.634074 -0.154981 0.012307 -0.109969 -0.135474 0.063457
1959-04-01 -0.672058 -0.087580 -0.039489 0.025380 -0.150690 0.210907
1959-05-01 -0.434015 -0.089333 0.030766 0.024450 -0.151005 0.072027
1959-06-01 -0.181955 -0.056294 -0.147395 0.112033 -0.112040 0.050307
1959-07-01 0.189512 -0.170787 -0.216749 0.020062 -0.254072 -0.007770
F7
Date
1959-03-01 0.079084
1959-04-01 -0.067066
1959-05-01 -0.047864
1959-06-01 0.011220
1959-07-01 0.139868
=== LOADINGS ===
F1 F2 F3 F4 F5 F6 \
RPI -0.804890 -0.522683 0.594732 -0.445156 0.311571 -0.102130
W875RX1 -1.323651 -0.542498 0.824017 -0.514824 0.399597 0.181471
DPCERA3M086SBEA -1.318999 0.374147 0.722967 -0.682692 0.372571 -1.507343
CMRMTSPLx -1.657474 0.268653 0.864031 -0.507607 0.558627 -0.834430
RETAILx -1.366324 0.783677 0.184843 -0.477917 0.653177 -1.274763
F7
RPI 0.136943
W875RX1 0.345291
DPCERA3M086SBEA 0.468549
CMRMTSPLx 0.250298
RETAILx 0.634052
# Number of factors (i.e., columns of the DataFrame)
num_factors = factors.shape[1]
colors = plt.get_cmap('tab10').colors  # palette with at least 10 colors
# Create a figure with as many subplots as rows (i.e., number of factors)
fig, axes = plt.subplots(nrows=num_factors, ncols=1, figsize=(12, 2.5 * num_factors), sharex=False, dpi = 600)
for i, ax in enumerate(axes):
    # Cycle through the palette in case there are more factors than colors
    col = colors[i % len(colors)]
    ax.plot(factors.index, factors.iloc[:, i], color=col)
    ax.set_ylabel(f'F{i+1}')
    ax.grid(False)
axes[-1].set_xlabel('Date')
fig.tight_layout()
fig.suptitle("Factors", fontsize=16)
# Second tight_layout reserves headroom for the suptitle
fig.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
NaN have been filled¶
Also NaN values, now, have been filled thanks to this process
Now let's check whether the procedure has succeeded: running the line below should show False.
# Sanity check: should evaluate to False — EM filled every missing value
X_f.isna().any().any()
False
And we also have its standardized version X_std
# Same check on the standardized version — should also be False
X_f_std.isna().any().any()
False
Regression, $R^2$, and $mR^2$¶
Cumulative regression for each series
For each series $x_i$ (i.e., each column of the dataset, such as inflation, employment, etc.), do the following:
- Regression on $F1$
$$ x_{t,i} = \alpha_i + \beta_{i,1} F_{1,t} + \varepsilon_{i,t} $$
→ Let's save $R_i(1)^{2}$
- Regression on $F1 + F2$
$$ x_{t,i} = \alpha_i + \beta_{i,1} F_{1,t} + \beta_{i,2} F_{2,t} + \varepsilon_{i,t} $$
→ Let's save $R_i(2)^{2}$
- Regression on $F1 + F2 + F3$
$$ x_{t,i} = \alpha_i + \beta_{i,1} F_{1,t} + \beta_{i,2} F_{2,t} + \beta_{i,t} F_{3,t} + \varepsilon_{i,t} $$
→ Let's save $R_i(3)^{2}$
- we continue up to $r$, the optimal number of factors
NOTE:
`np.linalg.lstsq` solves the least-squares problem; it is numerically stable, optimized, and handles multiple dependent variables in a single call. It avoids manual matrix inversion.
# Cumulative R² of each series regressed on the first k factors, k = 1..r
T, N = X_f_std.shape
r = factors.shape[1]
X_np_std = X_f_std.to_numpy()
F_np = factors.to_numpy()
columns = X_f_std.columns
y_mean = np.mean(X_np_std, axis=0)
# Total sum of squares per series
SST = np.sum((X_np_std - y_mean)**2, axis=0)
# Initialize DataFrame with index = variable names
R2_matrix = pd.DataFrame(index=columns)
# Loop over k factors
for k in range(1, r + 1):
    F_k = F_np[:, :k]    # first k factors
    # NOTE(review): no intercept column is added — the data and factors are
    # (approximately) mean zero here, so the omission is harmless; confirm
    # before reusing this on non-demeaned data
    beta_hat, *_ = np.linalg.lstsq(F_k, X_np_std, rcond=None)
    y_hat = F_k @ beta_hat
    # Explained sum of squares of the k-factor fit, per series
    SSR = np.sum((y_hat - y_mean)**2, axis=0)
    R2 = SSR / SST
    # Add column for cumulative R² with k factors
    R2_matrix[f'R2_F{k}'] = R2
Now we have a matrix R2_matrix containing the $R^2$ (computed as stated before) on the columns and the variables on the rows
print(R2_matrix.head())
R2_F1 R2_F2 R2_F3 R2_F4 R2_F5 R2_F6 \
RPI 0.110971 0.131306 0.155288 0.165859 0.170515 0.170852
W875RX1 0.299966 0.321856 0.367886 0.382022 0.389687 0.390745
DPCERA3M086SBEA 0.297816 0.308254 0.343691 0.368542 0.375204 0.448385
CMRMTSPLx 0.470080 0.475465 0.526062 0.539802 0.554758 0.577185
RETAILx 0.319538 0.365299 0.367627 0.379809 0.400234 0.452580
R2_F7
RPI 0.171356
W875RX1 0.393956
DPCERA3M086SBEA 0.454303
CMRMTSPLx 0.578868
RETAILx 0.463431
Compute marginals for each series
After obtaining all $R_i(k)^{2}$, compute for each $k$ and each series:
- $mR_i(1)^{2} = R_i(1)^{2}$
- $mR_i(2)^{2} = R_i(2)^{2} - R_i(1)^{2}$
- $mR_i(3)^{2} = R_i(3)^{2} - R_i(2)^{2}$
- ... up to $mR_i(r)^{2}$
# Marginal R²: column k holds R²(k) − R²(k−1); the first column is R²(1) itself
mR2_matrix = R2_matrix.diff(axis=1)
mR2_matrix.iloc[:, 0] = R2_matrix.iloc[:, 0]
print(mR2_matrix.head())
R2_F1 R2_F2 R2_F3 R2_F4 R2_F5 R2_F6 \
RPI 0.110971 0.020335 0.023983 0.010571 0.004656 0.000337
W875RX1 0.299966 0.021890 0.046030 0.014137 0.007664 0.001058
DPCERA3M086SBEA 0.297816 0.010438 0.035437 0.024850 0.006662 0.073181
CMRMTSPLx 0.470080 0.005385 0.050597 0.013741 0.014956 0.022427
RETAILx 0.319538 0.045761 0.002328 0.012182 0.020425 0.052346
R2_F7
RPI 0.000504
W875RX1 0.003211
DPCERA3M086SBEA 0.005918
CMRMTSPLx 0.001684
RETAILx 0.010851
Average per factor
Finally, for each factor $k$, compute:
$$ mR(k)^{2} = \frac{1}{N} \sum_{i=1}^{N} mR_i(k)^{2} $$
# Mean for each factor (i.e., by column)
mR2_average = mR2_matrix.mean(axis=0)
# Name the Series so it prints/exports with a meaningful label
mR2_average.name = 'mean_mR2'
print(mR2_average)
R2_F1 0.171063 R2_F2 0.074508 R2_F3 0.067727 R2_F4 0.053311 R2_F5 0.047765 R2_F6 0.032216 R2_F7 0.027055 Name: mean_mR2, dtype: float64
Series that “load the most” on each factor
For each factor $k$:
- sort the series by $mR_i(k)^{2}$
- take the top 10
# For each factor, keep the ten series with the largest marginal R²
top10_by_factor = {
    col: mR2_matrix[col].sort_values(ascending=False).head(10)
    for col in mR2_matrix.columns
}
print("Top 10 series for F1:")
print(top10_by_factor['R2_F1'])
Top 10 series for F1: IPMANSICS 0.802027 PAYEMS 0.783759 INDPRO 0.761174 IPFPNSS 0.756845 CUMFNS 0.749975 USGOOD 0.742252 MANEMP 0.690483 IPFINAL 0.688075 DMANEMP 0.645203 IPDMAT 0.625748 Name: R2_F1, dtype: float64
Total Variation Explained by the factors
xhat_np = xhat.to_numpy(float)
# Total Variation Explained by the common component
SSE = np.square(X_np_std - xhat_np).sum()   # Residual sum of squares
SST = np.square(X_np_std).sum()             # Total sum of squares (data are standardized)
TVE = 1 - SSE / SST
print(f"Total Variation Explained: {TVE:.4f}")
Total Variation Explained: 0.4736
Table¶
# Build the display table: one two-column block ('Series', 'mR²') per factor,
# with the factor's mean marginal R² on the first row and its top-10 series below.
# Number of factors
r = len(top10_by_factor)
# List of DataFrames (one per factor)
col_blocks = []
for k in range(1, r + 1):
    factor_label = f'F{k}'    # (unused; kept for readability of the loop)
    col_label = f'R2_F{k}'
    col_name = f'mR²({k})'
    # Top 10 series and values
    top10 = top10_by_factor[col_label]
    # Create the column for this factor: first row is the factor's mean
    # marginal R² (the 'mean' row), then the ten top-loading series
    col_df = pd.DataFrame({
        (col_name, 'Series'): [''] + list(top10.index),
        (col_name, 'mR²'): [f"{mR2_average[col_label]:.3f}"] + [f"{v:.3f}" for v in top10.values]
    })
    col_blocks.append(col_df)
# Concatenate all columns (axis=1 → side-by-side columns)
table_df = pd.concat(col_blocks, axis=1)
# Assign index: 'mean', '1', ..., '10'
table_df.index = ['mean'] + [str(i) for i in range(1, 11)]
print(f"Total Variation Explained: {TVE:.3f}")
HTML(table_df.to_html(escape=False))
Total Variation Explained: 0.474
| mR²(1) | mR²(2) | mR²(3) | mR²(4) | mR²(5) | mR²(6) | mR²(7) | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Series | mR² | Series | mR² | Series | mR² | Series | mR² | Series | mR² | Series | mR² | Series | mR² | |
| mean | 0.171 | 0.075 | 0.068 | 0.053 | 0.048 | 0.032 | 0.027 | |||||||
| 1 | IPMANSICS | 0.802 | CUSR0000SAC | 0.601 | BAAFFM | 0.417 | GS1 | 0.618 | T1YFFM | 0.538 | AWHMAN | 0.365 | S&P 500 | 0.444 |
| 2 | PAYEMS | 0.784 | CUSR0000SA0L2 | 0.587 | AAAFFM | 0.414 | GS5 | 0.612 | TB6SMFFM | 0.517 | CES0600000007 | 0.362 | S&P div yield | 0.324 |
| 3 | INDPRO | 0.761 | DNDGRG3M086SBEA | 0.562 | T10YFFM | 0.377 | GS10 | 0.558 | T5YFFM | 0.491 | UEMP15OV | 0.315 | S&P PE ratio | 0.253 |
| 4 | IPFPNSS | 0.757 | CPIAUCSL | 0.556 | T5YFFM | 0.315 | TB6MS | 0.550 | TB3SMFFM | 0.456 | UEMP27OV | 0.173 | UMCSENTx | 0.177 |
| 5 | CUMFNS | 0.750 | CPITRNSL | 0.546 | HOUST | 0.310 | AAA | 0.526 | T10YFFM | 0.418 | BUSINVx | 0.143 | VIXCLSx | 0.167 |
| 6 | USGOOD | 0.742 | CUSR0000SA0L5 | 0.533 | PERMIT | 0.260 | BAA | 0.448 | AAAFFM | 0.341 | S&P PE ratio | 0.142 | EXCAUSx | 0.120 |
| 7 | MANEMP | 0.690 | PCEPI | 0.520 | HOUSTW | 0.258 | TB3MS | 0.447 | BAAFFM | 0.269 | UEMP15T26 | 0.140 | NONBORRES | 0.085 |
| 8 | IPFINAL | 0.688 | CPIULFSL | 0.483 | PERMITW | 0.257 | CP3Mx | 0.431 | COMPAPFFx | 0.247 | UEMPMEAN | 0.118 | TOTRESNS | 0.081 |
| 9 | DMANEMP | 0.645 | WPSFD49502 | 0.424 | HOUSTS | 0.237 | TWEXAFEGSMTHx | 0.227 | PERMIT | 0.231 | MANEMP | 0.088 | BOGMBASE | 0.079 |
| 10 | IPDMAT | 0.626 | WPSFD49207 | 0.407 | TB3SMFFM | 0.233 | FEDFUNDS | 0.173 | PERMITW | 0.211 | USWTRADE | 0.087 | TB3MS | 0.060 |
LaTex¶
If you want to reproduce the table shown above, run this cell; it generates LaTeX code that you can paste into your LaTeX document.
NOTE
This code is dynamic in such a way you can just run the cell below with the results obtained from the code
# Translation table mapping each LaTeX-special character to its escaped form.
_TEX_SPECIALS = str.maketrans({ch: f"\\{ch}" for ch in "&_#%"})

def texify(s: str) -> str:
    """Escape the LaTeX-special characters &, _, # and % in *s* (spaces kept)."""
    return s.translate(_TEX_SPECIALS)
# Assemble the full "mean + top-10" table under a two-level column index:
# one (Series, mR²) column pair per factor.
r = len(top10_by_factor)
header_pairs = []
for k in range(1, r + 1):
    factor_label = rf'\emph{{$mR^2({k})$}}'
    header_pairs.append((factor_label, 'Series'))
    header_pairs.append((factor_label, r'$mR^2$'))
cols = pd.MultiIndex.from_tuples(header_pairs)
idx = ['mean'] + [str(i) for i in range(1, 11)]
big = pd.DataFrame('', index=idx, columns=cols)
# "mean" row: one average marginal R² per factor.
for k in range(1, r + 1):
    label = rf'\emph{{$mR^2({k})$}}'
    big.loc['mean', (label, r'$mR^2$')] = f'{mR2_average[f"R2_F{k}"]:.3f}'
# Rows 1..10: the ten series with the highest marginal R² per factor.
for k in range(1, r + 1):
    label = rf'\emph{{$mR^2({k})$}}'
    ranking = top10_by_factor[f'R2_F{k}']
    for rank, (name, val) in enumerate(ranking.items(), start=1):
        big.loc[str(rank), (label, 'Series')] = texify(name)
        big.loc[str(rank), (label, r'$mR^2$')] = f'{val:.3f}'
# Split into two halves for typesetting: factors 1-4 and 5-7.
left = big.iloc[:, :8]   # 4 factors × 2 columns
right = big.iloc[:, 8:]  # 3 factors × 2 columns
# Export both table halves plus a TVE macro for inclusion in a LaTeX document.
def _half_to_latex(half: pd.DataFrame, n_factors: int) -> str:
    # one 'l' for the row index plus an 'l r' (Series, mR²) pair per factor
    return half.to_latex(
        escape=False, multicolumn=True, multicolumn_format='c',
        index=True, column_format='l' + (' l r' * n_factors)
    )

latex_left = _half_to_latex(left, 4)
latex_right = _half_to_latex(right, 3)
for fname, payload in (('factor_marginals_left.tex', latex_left),
                       ('factor_marginals_right.tex', latex_right)):
    with open(fname, 'w') as f:
        f.write(payload)
# macro with the TVE
TVE_str = f"{TVE:.3f}"
with open('TVE_value.tex', 'w') as f:
    f.write(rf'\newcommand{{\TVEValue}}{{{TVE_str}}}')
print("Created: factor_marginals_left.tex, factor_marginals_right.tex, TVE_value.tex")
Created: factor_marginals_left.tex, factor_marginals_right.tex, TVE_value.tex
Figure 1 in the paper¶
# FRED-MD series grouped as in the paper's Figure 1.  Group names below are
# inferred from the series mnemonics (assumed to follow the FRED-MD appendix
# grouping — TODO confirm against the appendix).
group_dict = {
    # Group 1 — output and income (industrial production, real income)
    1: ["RPI","W875RX1","INDPRO","IPFPNSS","IPFINAL","IPCONGD","IPDCONGD",
        "IPNCONGD","IPBUSEQ","IPMAT","IPDMAT","IPNMAT","IPMANSICS","IPB51222s",
        "IPFUELS","NAPMPI","CUMFNS"],
    # Group 2 — labor market (employment, unemployment, hours, earnings)
    2: ["HWI","HWIURATIO","CLF16OV","CE16OV","UNRATE","UEMPMEAN","UEMPLT5",
        "UEMP5TO14","UEMP15OV","UEMP15T26","UEMP27OV","CLAIMSx","PAYEMS",
        "USGOOD","CES1021000001","USCONS","MANEMP","DMANEMP","NDMANEMP",
        "SRVPRD","USTPU","USWTRADE","USTRADE","USFIRE","USGOVT",
        "CES0600000007","AWOTMAN","AWHMAN","NAPMEI","CES0600000008",
        "CES2000000008","CES3000000008"],
    # Group 3 — housing (starts and permits)
    3: ["HOUST","HOUSTNE","HOUSTMW","HOUSTS","HOUSTW",
        "PERMIT","PERMITNE","PERMITMW","PERMITS","PERMITW"],
    # Group 4 — consumption, orders and inventories
    4: ["DPCERA3M086SBEA","CMRMTSPLx","RETAILx","NAPM","NAPMNOI","NAPMSDI",
        "NAPMII","ACOGNO","AMDMNOx","ANDENOx","AMDMUOx","BUSINVx",
        "ISRATIOx","UMCSENTx"],
    # Group 5 — money and credit
    5: ["M1SL","M2SL","M2REAL","AMBSL","TOTRESNS","NONBORRES","BUSLOANS",
        "REALLN","NONREVSL","CONSPI","MZMSL","DTCOLNVHFNM","DTCTHFNM","INVEST"],
    # Group 6 — interest rates and exchange rates
    6: ["FEDFUNDS","CP3Mx","TB3MS","TB6MS","GS1","GS5","GS10","AAA","BAA",
        "COMPAPFFx","TB3SMFFM","TB6SMFFM","T1YFFM","T5YFFM","T10YFFM",
        "AAAFFM","BAAFFM","TWEXMMTH","EXSZUSx","EXJPUSx","EXUSUKx","EXCAUSx"],
    # Group 7 — prices (PPI, CPI, PCE deflators)
    7: ["PPIFGS","PPIFCG","PPIITM","PPICRM","OILPRICEx","PPICMM","NAPMPRI",
        "CPIAUCSL","CPIAPPSL","CPITRNSL","CPIMEDSL","CUSR0000SAC","CUUR0000SAD",
        "CUSR0000SAS","CPIULFSL","CUUR0000SA0L2","CUSR0000SA0L5","PCEPI",
        "DDURRG3M086SBEA","DNDGRG3M086SBEA","DSERRG3M086SBEA"],
    # Group 8 — stock market
    8: ["S&P 500","S&P: indust","S&P div yield","S&P PE ratio"],
}
def norm(name: str) -> str:
    """Canonical lookup key for a series name: lowercase, a-z and 0-9 only
    (spaces, ':', '&', etc. are dropped)."""
    keep = set("abcdefghijklmnopqrstuvwxyz0123456789")
    return "".join(ch for ch in name.lower() if ch in keep)
# Marginal R² of the final (r_opt-factor) fit, re-keyed by normalized name.
R2_last = R2_matrix.iloc[:, -1]
R2_last_norm = R2_last.copy()
R2_last_norm.index = R2_last_norm.index.map(norm)
# normalized name -> original spelling (for optional axis labels)
norm_to_original = {norm(series): series
                    for members in group_dict.values()
                    for series in members}
# ---------------------------------------------
# Walk groups 1..8 in order, keeping only the series actually present
# in R2_matrix; record the group code of each kept series.
# ---------------------------------------------
ordered_vars = []
group_codes = []
missing = []
for grp in range(1, 9):
    for series in group_dict[grp]:
        normalized = norm(series)
        if normalized not in R2_last_norm.index:
            missing.append(series)  # for debugging only
            continue
        ordered_vars.append(normalized)
        group_codes.append(grp)
if missing:
    print("[WARN] Series not found in R2_matrix:")
    print(missing)
R2_ord = R2_last_norm.loc[ordered_vars].to_numpy()
groups_ord = np.array(group_codes)
labels_ord = [norm_to_original[key] for key in ordered_vars]
# ---------------------------------------------
# Palette and colors for bar chart
# ---------------------------------------------
cmap = plt.colormaps["tab10"]  # Modern API (Matplotlib >= 3.6)
colors = cmap((groups_ord - 1) % 10)  # one qualitative color per group
# ---------------------------------------------
# Plot — replication of the paper's Figure 1: marginal R² of every series
# against the r_opt-factor model, bars ordered by FRED-MD group.
# ---------------------------------------------
fig, ax = plt.subplots(figsize=(13, 4), dpi=300)
ax.bar(range(len(R2_ord)), R2_ord, color=colors, edgecolor="black", lw=.15)
ax.set_ylabel("R²")
ax.set_ylim(0, 1)
ax.set_title(f"R²({r_opt}) ordered by group")
# Group separation and alternating bands
ticks, labels = [], []
start = 0
for g in range(1, 9):
    size = group_codes.count(g)  # number of series of group g that survived the lookup
    if size == 0:
        continue
    end = start + size - 1
    # grey band for even groups (visual alternation)
    if g % 2 == 0:
        ax.axvspan(start-0.5, end+0.5, color="lightgrey", alpha=.20)
    # separator line at the right edge of the group
    ax.axvline(end + 0.5, color="black", lw=1.6)
    ticks.append((start + end) / 2)  # tick centred under the group
    labels.append(f"Group {g}")
    start += size
ax.set_xticks(ticks)
ax.set_xticklabels(labels, rotation=35, ha="right")
ax.set_xlim(-0.5, len(R2_ord) - 0.5)
plt.tight_layout()
plt.show()
[WARN] Series not found in R2_matrix: ['NAPMPI', 'NAPMEI', 'NAPM', 'NAPMNOI', 'NAPMSDI', 'NAPMII', 'AMBSL', 'MZMSL', 'TWEXMMTH', 'PPIFGS', 'PPIFCG', 'PPIITM', 'PPICRM', 'NAPMPRI', 'CUUR0000SAD', 'CUUR0000SA0L2', 'S&P: indust']
Forecast¶
Now the paths of the paper and the Professor diverge.
The professor asks:
Using the best factor model, I would like to see the forecast for Industrial Production (the level, not the growth) and Inflation (the first difference of CPI) for the next six months.
For these two series, I would also like to see an evaluation of the forecast using a rolling windows of data. If you feel like, you can do a real time evaluation, by using the same data a researcher would use if she were to run the model in the last period of the estimation window.
1. Six - Month Forecast¶
# Verify the two raw target series are complete before building the dataset.
print("NaN in CPIAUCSL:", df["CPIAUCSL"].isna().sum())
print("NaN in INDPRO:", df["INDPRO"].isna().sum())
NaN in CPIAUCSL: 0 NaN in INDPRO: 0
Since the original series don't have NaNs, we can avoid reconstructing them and use the original series directly.
# Keep only the two target series, indexed by month.
Y_df_6m = (
    df[["Date", "CPIAUCSL", "INDPRO"]]
    .copy()
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
    .set_index("Date")
)
print(Y_df_6m.head())
CPIAUCSL INDPRO Date 1959-01-01 29.01 21.9616 1959-02-01 29.00 22.3917 1959-03-01 28.97 22.7142 1959-04-01 28.98 23.1981 1959-05-01 29.04 23.5476
Now let's compute the first difference of CPIAUCSL and drop the initial rows.
# ΔCPI at month t = CPI(t) - CPI(t-1).
Y_df_6m["diff_CPIAUCSL"] = Y_df_6m["CPIAUCSL"].diff()
# Drop the first two months (diff NaN + alignment with the factor sample)
# and discard the CPI level, which is no longer needed.
Y_df_6m = Y_df_6m.iloc[2:].drop(columns=["CPIAUCSL"])
print(Y_df_6m)
INDPRO diff_CPIAUCSL Date 1959-03-01 22.7142 -0.030 1959-04-01 23.1981 0.010 1959-05-01 23.5476 0.060 1959-06-01 23.5744 0.070 1959-07-01 23.0099 0.040 ... ... ... 2024-11-01 101.9619 0.885 2024-12-01 103.1177 1.154 2025-01-01 103.3418 1.483 2025-02-01 104.2202 0.689 2025-03-01 103.8892 -0.160 [793 rows x 2 columns]
print(factors.head())
F1 F2 F3 F4 F5 F6 \
Date
1959-03-01 -0.634074 -0.154981 0.012307 -0.109969 -0.135474 0.063457
1959-04-01 -0.672058 -0.087580 -0.039489 0.025380 -0.150690 0.210907
1959-05-01 -0.434015 -0.089333 0.030766 0.024450 -0.151005 0.072027
1959-06-01 -0.181955 -0.056294 -0.147395 0.112033 -0.112040 0.050307
1959-07-01 0.189512 -0.170787 -0.216749 0.020062 -0.254072 -0.007770
F7
Date
1959-03-01 0.079084
1959-04-01 -0.067066
1959-05-01 -0.047864
1959-06-01 0.011220
1959-07-01 0.139868
As in the paper, we use the BIC to choose the best lag order both for the dependent variable Y and for the independent variables X.
# Create lags of the variables
def create_lagged_matrix(series: pd.DataFrame, max_lag: int, prefix: str = None) -> pd.DataFrame:
lagged_data = {}
for col in series.columns:
for lag in range(1, max_lag + 1):
col_name = f"{prefix}_{col}_lag{lag}" if prefix else f"{col}_lag{lag}"
lagged_data[col_name] = series[col].shift(lag)
return pd.DataFrame(lagged_data, index=series.index)
# Evaluate all lag combinations for Y and factors
def evaluate_lags(Y_df: pd.DataFrame, target_col: str,
                  factors: pd.DataFrame,
                  max_lag_y: int = 6,
                  max_lag_f: int = 6) -> pd.DataFrame:
    """Grid-search lag orders (p for the target, m for the factors) by OLS.

    For every (p, m) with p in 0..max_lag_y and m in 1..max_lag_f, fits
    Y(t+1) ~ const + Y(t-1..t-p) + F(t-1..t-m) and records AIC, BIC and R².
    Returns a DataFrame of all specifications sorted by BIC (ascending).

    NOTE(review): the target here is shifted one period ahead
    (``y.shift(-1)``), while the final models re-fitted below regress the
    contemporaneous target on the same lags — confirm the two alignments
    are intended to differ.
    """
    results = []
    for p, m in product(range(max_lag_y + 1), range(1, max_lag_f + 1)):
        y = Y_df[[target_col]].copy()
        y.columns = ['Y']  # target at t+1
        # Lags of Y (from 1 to p); p == 0 means a pure factor model
        if p > 0:
            y_lags = create_lagged_matrix(Y_df[[target_col]], p, prefix='Y')
        else:
            y_lags = pd.DataFrame(index=Y_df.index)
        # Lags of the factors (from 1 to m)
        f_lags = create_lagged_matrix(factors, m, prefix='F')
        # Build final dataset
        X = pd.concat([y_lags, f_lags], axis=1)
        Y_target = y.shift(-1)  # target at t+1
        data = pd.concat([Y_target, X], axis=1).dropna()
        if data.empty:
            continue
        y_final = data['Y']
        X_final = sm.add_constant(data.drop(columns='Y'))
        model = sm.OLS(y_final, X_final).fit()
        results.append({
            'lag_Y': p,
            'lag_F': m,
            'AIC': model.aic,
            'BIC': model.bic,
            'R2': model.rsquared
        })
    return pd.DataFrame(results).sort_values(by='BIC').reset_index(drop=True)
INDPRO¶
# Grid-search lag orders for INDPRO; report the five lowest-BIC specifications.
# (The Italian header below means "best models according to BIC".)
results_IND = evaluate_lags(Y_df_6m, target_col="INDPRO", factors=factors,
                            max_lag_y=6, max_lag_f=6)
bic_6m_IND = results_IND.sort_values(by='BIC').head(5)
print("MIGLIORI MODELLI SECONDO BIC")
print(bic_6m_IND[['lag_Y', 'lag_F', 'BIC', 'R2']].to_string(index=False))
MIGLIORI MODELLI SECONDO BIC
lag_Y lag_F BIC R2
3 1 2375.467375 0.998480
6 1 2375.729966 0.998489
1 1 2378.052052 0.998467
4 1 2378.771488 0.998477
2 1 2379.500400 0.998468
diff_CPIAUCSL¶
# Grid-search lag orders for ΔCPI; report the five lowest-BIC specifications.
# (The Italian header below means "best models according to BIC".)
results_CPI = evaluate_lags(Y_df_6m, target_col="diff_CPIAUCSL", factors=factors,
                            max_lag_y=6, max_lag_f=6)
bic_6m_CPI = results_CPI.sort_values(by='BIC').head(5)
print("MIGLIORI MODELLI SECONDO BIC")
print(bic_6m_CPI[['lag_Y', 'lag_F', 'BIC', 'R2']].to_string(index=False))
MIGLIORI MODELLI SECONDO BIC
lag_Y lag_F BIC R2
6 1 1083.646067 0.178863
5 1 1097.017813 0.158363
4 1 1098.018519 0.150947
3 1 1099.174212 0.143283
1 1 1112.052241 0.115923
Forecast INDPRO¶
Let's create the lags using the optimal lag order chosen above.
# Regression dataset for INDPRO: the target level plus 3 own lags and 1 lag
# of each factor, restricted to dates where everything is available.
indpro_lags = create_lagged_matrix(Y_df_6m[["INDPRO"]], max_lag=3).dropna()
factors_lags_IND = create_lagged_matrix(factors, max_lag=1).dropna()
regressors_IND = indpro_lags.join(factors_lags_IND, how='inner')
final_IND = Y_df_6m['INDPRO'].to_frame().join(regressors_IND, how='inner')
print(final_IND)
INDPRO INDPRO_lag1 INDPRO_lag2 INDPRO_lag3 F1_lag1 \
Date
1959-06-01 23.5744 23.5476 23.1981 22.7142 -0.434015
1959-07-01 23.0099 23.5744 23.5476 23.1981 -0.181955
1959-08-01 22.2304 23.0099 23.5744 23.5476 0.189512
1959-09-01 22.2035 22.2304 23.0099 23.5744 0.765563
1959-10-01 22.0422 22.2035 22.2304 23.0099 -0.010836
... ... ... ... ... ...
2024-11-01 101.9619 102.2138 102.5954 103.0196 0.222903
2024-12-01 103.1177 101.9619 102.2138 102.5954 0.026669
2025-01-01 103.3418 103.1177 101.9619 102.2138 -0.141360
2025-02-01 104.2202 103.3418 103.1177 101.9619 -0.035340
2025-03-01 103.8892 104.2202 103.3418 103.1177 -0.009237
F2_lag1 F3_lag1 F4_lag1 F5_lag1 F6_lag1 F7_lag1
Date
1959-06-01 -0.089333 0.030766 0.024450 -0.151005 0.072027 -0.047864
1959-07-01 -0.056294 -0.147395 0.112033 -0.112040 0.050307 0.011220
1959-08-01 -0.170787 -0.216749 0.020062 -0.254072 -0.007770 0.139868
1959-09-01 -0.100290 -0.384532 0.009054 -0.433862 -0.171845 0.182504
1959-10-01 0.072548 -0.226914 0.337323 -0.243761 -0.209014 -0.143375
... ... ... ... ... ... ...
2024-11-01 0.002868 -0.166846 0.160544 0.089281 -0.062023 0.089715
2024-12-01 -0.018730 -0.052244 0.147181 0.083626 -0.002342 0.043366
2025-01-01 0.083604 -0.058928 -0.003179 0.116681 -0.048115 -0.035268
2025-02-01 -0.002955 -0.125281 0.149342 -0.003125 0.139905 0.015350
2025-03-01 -0.160394 0.104215 -0.116207 0.098661 0.023124 -0.008177
[790 rows x 11 columns]
OLS¶
# OLS fit of INDPRO (level) on its own lags and the lagged factors.
# Extract Y and X
Y_IND = final_IND.iloc[:, 0].values.reshape(-1, 1)  # INDPRO (target)
X_IND_raw = final_IND.iloc[:, 1:]                   # all other columns as regressors
# Add constant (column of 1s)
X_IND = np.hstack([np.ones((X_IND_raw.shape[0], 1)), X_IND_raw.values])
var_names_IND = ['const'] + list(X_IND_raw.columns)
# OLS estimation: solve the normal equations X'X β = X'Y directly.
# np.linalg.solve is numerically safer than forming (X'X)^(-1) explicitly.
beta_IND = np.linalg.solve(X_IND.T @ X_IND, X_IND.T @ Y_IND)
# Fitted values
Y_hat_IND = X_IND @ beta_IND
# Residuals
residuals_IND = Y_IND - Y_hat_IND
# R² = 1 - SSR/SST
ss_total_IND = np.sum((Y_IND - np.mean(Y_IND)) ** 2)
ss_resid_IND = np.sum(residuals_IND ** 2)
r_squared_IND = 1 - (ss_resid_IND / ss_total_IND)
# Coefficient table with variable names
coef_df_IND = pd.DataFrame({
    'Variable': var_names_IND,
    'Beta': beta_IND.flatten()
})
print("Coefficienti stimati (beta_IND):")
print(coef_df_IND.to_string(index=False))
print(f"\nR²_IND: {r_squared_IND:.6f}")
Coefficienti stimati (beta_IND):
Variable Beta
const 0.262131
INDPRO_lag1 1.032763
INDPRO_lag2 -0.178734
INDPRO_lag3 0.143785
F1_lag1 -0.604878
F2_lag1 0.315490
F3_lag1 -0.002732
F4_lag1 0.194542
F5_lag1 -0.378690
F6_lag1 -0.071213
F7_lag1 1.246587
R²_IND: 0.999420
Assumption: Persistence of Factors¶
We assume that the values of the factors remain constant over the forecast horizon.
In other words, the lagged values of $F1–F7$ at time $T$ are reused for all future periods $T+1$ to $T+6$.
- Fast and simple: no need to estimate additional models.
- Not suitable when highly accurate forecasts are required: factor persistence is a strong simplification.
p_IND = 3  # lag of INDPRO
m_IND = 1  # lag of factors
pred_cols_IND = (
    [f"INDPRO_lag{i}" for i in range(1, p_IND + 1)] +
    [f"F{i}_lag{m_IND}" for i in range(1, 8)]
)
# 2) h-STEP FORECAST FUNCTION FOR INDPRO
def forecast_INDPRO(df_ind: pd.DataFrame,
                    beta_vec: np.ndarray,
                    h: int = 6) -> pd.Series:
    """Iterated one-step forecasts of the INDPRO level, horizons 1..h.

    Parameters
    ----------
    df_ind : DataFrame whose last row contains "INDPRO" and the regressor
        columns listed in ``pred_cols_IND``.
    beta_vec : OLS coefficient vector, constant first, shape (k+1,).
    h : forecast horizon in months.

    The fitted regression maps lags dated t-1..t-3 (and F(t-1)) to
    INDPRO(t), so the autoregressive lags must be rolled forward BEFORE each
    prediction: lag1 has to equal the latest available INDPRO.  The original
    version used the last row's lags unchanged on the first step, which
    merely reproduced the in-sample fitted value at time T (off by one);
    the rolling-window evaluation below builds its regressor vector the
    corrected way.  Factor regressors stay at their last observed values
    (persistence assumption).
    """
    preds, dates = [], []
    state = df_ind.iloc[-1].copy()               # last observation (time T)
    last_date = pd.to_datetime(df_ind.index[-1])
    for step in range(1, h + 1):
        # roll the AR lags: lag_k <- lag_{k-1}, then lag1 <- latest INDPRO
        for lag in range(p_IND, 1, -1):
            state[f"INDPRO_lag{lag}"] = state[f"INDPRO_lag{lag-1}"]
        state["INDPRO_lag1"] = state["INDPRO"]
        X_in = np.concatenate([[1.0], state[pred_cols_IND].values])
        y_pred = float(X_in @ beta_vec)          # forecast for T+step
        state["INDPRO"] = y_pred                 # feeds lag1 next iteration
        preds.append(y_pred)
        dates.append(last_date + pd.offsets.MonthBegin(step))
        # fixed factors (persistence): Fi_lag1 never updated
    return pd.Series(preds, index=dates, name="INDPRO_forecast")
# 3) DATAFRAME WITH REQUIRED COLUMNS FOR FORECAST
#    (target first, then the regressors in the order the betas were estimated)
final_IND_sel = final_IND[["INDPRO"] + pred_cols_IND]
# 4) 6-MONTH FORECAST
forecast_6m_IND = forecast_INDPRO(
    final_IND_sel,
    beta_IND.flatten(),  # vector (k+1,)
    h=6
)
print("\nINDPRO forecasts for the next 6 months:")
print(forecast_6m_IND.round(3))
INDPRO forecasts for the next 6 months: 2025-04-01 104.136 2025-05-01 103.669 2025-06-01 104.109 2025-07-01 103.536 2025-08-01 104.109 2025-09-01 103.371 Name: INDPRO_forecast, dtype: float64
# Plot: the last year of observed INDPRO followed by the 6-month forecast,
# with a "bridge" segment joining the two so the line is continuous.
# 1) Last 12 months of observed INDPRO
actual_last12_IND = final_IND["INDPRO"].iloc[-12:]
# 2) Forecast series for 6 months
fcst_IND = forecast_6m_IND
# 3) Combine observed + forecast (long format for plotting)
plot_series_IND = (
    pd.concat(
        {"INDPRO (observed)": actual_last12_IND,
         "INDPRO (forecast)": fcst_IND}
    )
    .to_frame(name="value")
    .reset_index()
    .rename(columns={"level_0": "Series",
                     "level_1": "Date"})
)
# 4) Plot with "bridge" between last observed and first forecast
fig, ax = plt.subplots(figsize=(8, 4), dpi=500)
forecast_color = None  # filled in while iterating the groups below
for lbl, grp in plot_series_IND.groupby("Series"):
    line, = ax.plot(grp["Date"], grp["value"],
                    marker="" if "forecast" in lbl else None,
                    linestyle="-",
                    label=lbl)
    if "forecast" in lbl:  # store the forecast line color for the bridge
        forecast_color = line.get_color()
# — bridge segment in same color as forecast —
last_obs_date = actual_last12_IND.index[-1]
last_obs_value = actual_last12_IND.iloc[-1]
first_fcst_date = fcst_IND.index[0]
first_fcst_val = fcst_IND.iloc[0]
ax.plot([last_obs_date, first_fcst_date],
        [last_obs_value, first_fcst_val],
        color=forecast_color, linestyle="-", linewidth=1.2)
# — plot details —
ax.set_title("INDPRO: last 12 months observed + 6-month forecast")
ax.set_ylabel("Level (index 2017=100)")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
Forecast CPI¶
# Regressor matrix for the ΔCPI model: 6 own lags + 1 lag of each factor,
# restricted to dates where everything is available.
cpi_lags = create_lagged_matrix(Y_df_6m[["diff_CPIAUCSL"]], max_lag=6).dropna()
factors_lags_CPI = create_lagged_matrix(factors, max_lag=1).dropna()
final_CPI = Y_df_6m["diff_CPIAUCSL"].to_frame().join(
    cpi_lags.join(factors_lags_CPI, how="inner"), how="inner"
)

def _cpi_col(col: str) -> str:
    # Tag every column with "_CPI" (inserted after the series stem for ΔCPI lags).
    if col.startswith("diff_CPIAUCSL"):
        return col.replace("diff_CPIAUCSL", "diff_CPIAUCSL_CPI")
    return f"{col}_CPI"

final_CPI.columns = [_cpi_col(col) for col in final_CPI.columns]
print(final_CPI.head())
diff_CPIAUCSL_CPI diff_CPIAUCSL_CPI_lag1 diff_CPIAUCSL_CPI_lag2 \
Date
1959-09-01 0.07 0.03 0.04
1959-10-01 0.10 0.07 0.03
1959-11-01 0.00 0.10 0.07
1959-12-01 0.06 0.00 0.10
1960-01-01 -0.04 0.06 0.00
diff_CPIAUCSL_CPI_lag3 diff_CPIAUCSL_CPI_lag4 \
Date
1959-09-01 0.07 0.06
1959-10-01 0.04 0.07
1959-11-01 0.03 0.04
1959-12-01 0.07 0.03
1960-01-01 0.10 0.07
diff_CPIAUCSL_CPI_lag5 diff_CPIAUCSL_CPI_lag6 F1_lag1_CPI \
Date
1959-09-01 0.01 -0.03 0.765563
1959-10-01 0.06 0.01 -0.010836
1959-11-01 0.07 0.06 0.317555
1959-12-01 0.04 0.07 0.020195
1960-01-01 0.03 0.04 -1.387339
F2_lag1_CPI F3_lag1_CPI F4_lag1_CPI F5_lag1_CPI F6_lag1_CPI \
Date
1959-09-01 -0.100290 -0.384532 0.009054 -0.433862 -0.171845
1959-10-01 0.072548 -0.226914 0.337323 -0.243761 -0.209014
1959-11-01 -0.215715 -0.095061 -0.017248 -0.114601 -0.077012
1959-12-01 -0.164277 -0.029766 0.019729 -0.067122 0.054608
1960-01-01 0.008380 0.257560 0.035697 0.034781 0.019750
F7_lag1_CPI
Date
1959-09-01 0.182504
1959-10-01 -0.143375
1959-11-01 0.006941
1959-12-01 -0.179049
1960-01-01 -0.325786
# OLS fit of ΔCPI on its own lags and the lagged factors.
Y_CPI = final_CPI.iloc[:, 0].values.reshape(-1, 1)  # first column = target (ΔCPI)
X_CPI_raw = final_CPI.iloc[:, 1:]                   # all others = regressors
X_CPI = np.hstack([np.ones((X_CPI_raw.shape[0], 1)), X_CPI_raw.values])
var_names_CPI = ["const"] + list(X_CPI_raw.columns)
# Solve the normal equations X'X β = X'Y directly — numerically safer than
# forming the explicit inverse of X'X.
beta_CPI = np.linalg.solve(X_CPI.T @ X_CPI, X_CPI.T @ Y_CPI)
Y_hat_CPI = X_CPI @ beta_CPI
residuals_CPI = Y_CPI - Y_hat_CPI
# R² = 1 - SSR/SST
ss_tot_CPI = np.sum((Y_CPI - np.mean(Y_CPI)) ** 2)
ss_res_CPI = np.sum(residuals_CPI ** 2)
r_squared_CPI = 1 - ss_res_CPI / ss_tot_CPI
# Coefficient table with variable names
coef_df_CPI = pd.DataFrame({
    "Variable": var_names_CPI,
    "Beta": beta_CPI.flatten()
})
print("\nCoefficienti stimati (beta_CPI):")
print(coef_df_CPI.to_string(index=False))
print(f"\nR²_CPI: {r_squared_CPI:.6f}")
Coefficienti stimati (beta_CPI):
Variable Beta
const 0.127399
diff_CPIAUCSL_CPI_lag1 0.420906
diff_CPIAUCSL_CPI_lag2 -0.052911
diff_CPIAUCSL_CPI_lag3 0.100514
diff_CPIAUCSL_CPI_lag4 0.099478
diff_CPIAUCSL_CPI_lag5 0.009028
diff_CPIAUCSL_CPI_lag6 0.080149
F1_lag1_CPI -0.101769
F2_lag1_CPI 0.251778
F3_lag1_CPI -0.178585
F4_lag1_CPI 0.053161
F5_lag1_CPI 0.046578
F6_lag1_CPI -0.158697
F7_lag1_CPI 0.584645
R²_CPI: 0.411454
# 1. LAGS TO USE (the same chosen using BIC)
p_CPI = 6  # number of ΔCPI lags
m_CPI = 1  # number of factor lags
pred_cols_CPI = (
    [f"diff_CPIAUCSL_CPI_lag{i}" for i in range(1, p_CPI + 1)] +
    [f"F{i}_lag{m_CPI}_CPI" for i in range(1, 8)]
)
# 2. sanity check that the coefficient vector exists
print("β_CPI available? shape:", beta_CPI.shape)
# 3. ITERATIVE h-STEP FORECAST FUNCTION FOR CPI
def forecast_diffCPI(df_CPI: pd.DataFrame,
                     beta_vec: np.ndarray,
                     h: int = 6) -> pd.Series:
    """Iterated one-step forecasts of ΔCPI for horizons 1..h.

    The fitted model maps lags dated t-1..t-6 (and F(t-1)) to ΔCPI(t), so
    the autoregressive lags are rolled forward BEFORE each prediction —
    lag1 must equal the latest available ΔCPI.  The original version used
    the last row's lags unchanged on the first step, reproducing the fitted
    value at the last in-sample date instead of a true forecast (the same
    off-by-one fixed in ``forecast_INDPRO``).  Factor regressors stay at
    their last observed values (persistence assumption).
    """
    preds, dates = [], []
    state = df_CPI.iloc[-1].copy()               # last available row (time T)
    last_date = pd.to_datetime(df_CPI.index[-1])
    for step in range(1, h + 1):
        # ---- roll the 6 AR lags forward: lag_k <- lag_{k-1} ------------
        for lag in range(p_CPI, 1, -1):
            state[f"diff_CPIAUCSL_CPI_lag{lag}"] = \
                state[f"diff_CPIAUCSL_CPI_lag{lag-1}"]
        state["diff_CPIAUCSL_CPI_lag1"] = state["diff_CPIAUCSL_CPI"]
        X_in = np.concatenate([[1.0], state[pred_cols_CPI].values])
        y_pred = float(X_in @ beta_vec)          # forecast ΔCPI for T+step
        state["diff_CPIAUCSL_CPI"] = y_pred      # feeds lag1 next iteration
        preds.append(y_pred)
        dates.append(last_date + pd.offsets.MonthBegin(step))
        # ---- factors: assumed constant (F?_lag1_CPI never updated) -----
    return pd.Series(preds, index=dates, name="diff_CPIAUCSL_forecast")
# 4. BUILD THE DATAFRAME WITH REQUIRED REGRESSORS
#    (target first, then the regressors in the order the betas were estimated)
final_CPI_sel = final_CPI[["diff_CPIAUCSL_CPI"] + pred_cols_CPI]
# 5. 6-MONTH FORECAST
forecast_6m_CPI = forecast_diffCPI(
    final_CPI_sel,
    beta_CPI.flatten(),  # vector (k+1,)
    h=6
)
β_CPI available? shape: (14, 1)
# Display the 6-month ΔCPI forecast path.
print("\nCPI forecasts for the next 6 months:")
print(forecast_6m_CPI.round(3))
CPI forecasts for the next 6 months: 2025-04-01 0.539 2025-05-01 0.285 2025-06-01 0.593 2025-07-01 0.309 2025-08-01 0.457 2025-09-01 0.294 Name: diff_CPIAUCSL_forecast, dtype: float64
# Plot: the last year of observed ΔCPI followed by the 6-month forecast,
# with a "bridge" segment joining the two so the line is continuous.
# 1) last 12 months of observed data
actual_last12_CPI = final_CPI["diff_CPIAUCSL_CPI"].iloc[-12:]
# 2) 6-month forecast
fcst_CPI = forecast_6m_CPI
# 3) concatenate for plotting (unchanged structure)
plot_series_CPI = (
    pd.concat(
        {"diff CPI (observed)": actual_last12_CPI,
         "diff CPI (forecast)": fcst_CPI}
    )
    .to_frame(name="value")
    .reset_index()
    .rename(columns={"level_0": "Series",
                     "level_1": "Date"})
)
fig, ax = plt.subplots(figsize=(8, 4), dpi=600)
forecast_color = None  # filled in while iterating the groups below
for lbl, grp in plot_series_CPI.groupby("Series"):
    line, = ax.plot(grp["Date"], grp["value"],
                    marker="" if "forecast" in lbl else None,
                    linestyle="-",
                    label=lbl)
    if "forecast" in lbl:  # store forecast line color for the bridge
        forecast_color = line.get_color()
# bridge segment with correct color
last_obs_date = actual_last12_CPI.index[-1]
last_obs_value = actual_last12_CPI.iloc[-1]
first_fcst_date = fcst_CPI.index[0]
first_fcst_val = fcst_CPI.iloc[0]
ax.plot([last_obs_date, first_fcst_date],
        [last_obs_value, first_fcst_val],
        color=forecast_color, linestyle="-", linewidth=1.2)
# plot details
ax.set_title("diff CPIAUCSL: last 12 months observed + 6-month forecast")
ax.set_ylabel("First-difference CPI (pp)")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
2. Real Time Evaluation¶
The dataset is split into 80% as train data and 20% as test data
We act as if we knew data up to t, we estimate the factors and use their lags, in this way we simulate the reality.
E.g. today we can use only the data we already know $\rightarrow$ no future information.
So each 'month' we estimate the model and then make a forecast about next month. When the real info is available we include it in our data and estimate the model again to predict next month observation and so on.
NOTE:
For this real time evaluation:
- we are going to skip the outliers procedure;
- the model will be re-estimated each month, i.e. at each observation.
# To work directly with df: parse the date column and promote it to the index.
df["Date"] = pd.to_datetime(df["Date"])
df = df.set_index("Date")
Loop for Real Time Evaluation¶
Let's create a silent version of the em_factors function so that we do not produce confusing output.
def em_factors_silent(
    df_transformed: pd.DataFrame,   # series with the "real" NaNs still in place
    df_missing: pd.DataFrame,       # NaNs replaced by the column mean μ (unused here — kept for interface parity; confirm)
    df_standardized: pd.DataFrame,  # z-scored data used as the EM starting point
    *,
    kmax: int = 15,
    tol: float = 1e-6,
    max_iter: int = 200,
    print_every: int | None = 0     # 0/None → no logging
):
    """Silent EM-style factor extraction with missing data.

    Each iteration: (1) pick the number of factors r* with the PC_p2
    criterion, (2) fit an r*-factor PCA, (3) refill only the originally
    missing cells with their common-component fit, re-standardize, and
    repeat until the relative change in the fitted matrix drops below
    *tol* or *max_iter* is reached.  Relies on the module-level helpers
    ``pc_p2_criterion`` and ``pca_stock_watson`` defined earlier.

    Returns a 6-tuple of DataFrames/ints on the original index/columns:
    (factors, loadings, r*, unstandardized data, standardized data,
    common component).
    """
    idx, cols = df_transformed.index, df_transformed.columns
    mask_nan = df_transformed.isna().to_numpy()   # True where data was truly missing
    X_std = df_standardized.to_numpy(float)
    X_hat_prev = np.zeros_like(X_std)
    err, it = np.inf, 0
    while err > tol and it < max_iter:
        it += 1
        r_star = pc_p2_criterion(X_std, kmax)
        F_hat, Λ_hat, X_hat, _ = pca_stock_watson(X_std, r_star)
        if it > 1:
            # relative squared change of the common component between iterations
            err = ((X_hat - X_hat_prev)**2).sum() / (X_hat_prev**2).sum()
        X_hat_prev = X_hat
        if print_every and (it == 1 or it % print_every == 0):
            print(f"Iter {it:3d} | r*={r_star:2d} | err={err:9.2e}")
        # NOTE(review): μ and σ are taken from the *already standardized*
        # matrix (≈0 and ≈1), presumably mirroring the original em_factors —
        # confirm this un-standardization step is intended.
        μ, σ = X_std.mean(0), X_std.std(0, ddof=0)
        σ[σ == 0] = 1.0
        X_unstd = X_std*σ + μ
        # refill only the cells that were missing in the raw data
        X_unstd[mask_nan] = (X_hat*σ + μ)[mask_nan]
        μn, σn = X_unstd.mean(0), X_unstd.std(0, ddof=0)
        σn[σn == 0] = 1.0
        X_std = (X_unstd - μn) / σn
    return (pd.DataFrame(F_hat, idx, [f"F{i+1}" for i in range(r_star)]),
            pd.DataFrame(Λ_hat, cols, [f"F{i+1}" for i in range(r_star)]),
            r_star,
            pd.DataFrame(X_unstd, idx, cols),
            pd.DataFrame(X_std, idx, cols),
            pd.DataFrame(X_hat, idx, cols))
INDPRO (Level)¶
- Lags of INDPRO = $3$
- Lags of factors = $1$
# PARAMETERS (same lag orders as the 6-month forecast block)
p_IND = 3  # lags of INDPRO
m_IND = 1  # lags of factors (can be 1, 2, 3, ...)
kmax_em = 15
tol_em = 1e-6
maxiter_em = 200
progress_mod = 16  # print ETA every 16 iterations
# DATA SPLIT (roughly 80% train / 20% test)
df_train = df_transformed.loc[:'2011-11-01'].copy()
df_test = df_transformed.loc['2011-12-01':].copy()
fcst_dates, fcst_values = [], []
tic, steps_total, step_count = time.time(), len(df_test), 0
# ROLLING-WINDOW LOOP: expanding window, one-step-ahead; factors and OLS are
# re-estimated every month using only information available at forecast time.
while not df_test.empty:
    # 1) fill NaNs + standardize
    df_nan = df_train.fillna(df_train.mean())
    df_std = (df_nan - df_nan.mean()) / df_nan.std()
    # 2) extract factors via EM (silent)
    factors_train, _, r_opt, *_ = em_factors_silent(
        df_transformed = df_train,
        df_missing = df_nan,
        df_standardized = df_std,
        kmax = kmax_em,
        tol = tol_em,
        max_iter = maxiter_em,
        print_every = 0
    )
    # 3) regressors = lags 1…m_IND of factors + lags 1…p_IND of INDPRO
    #    (df holds the raw INDPRO level; concat aligns on dates, dropna
    #    restricts to the training sample where all lags exist)
    lag_F = [factors_train.shift(l).add_suffix(f"_lag{l}")
             for l in range(1, m_IND + 1)]
    lag_Y = [df.loc[:, "INDPRO"].shift(l).to_frame(f"INDPRO_lag{l}")
             for l in range(1, p_IND + 1)]
    X_train = pd.concat(lag_F + lag_Y, axis=1).dropna()
    # 4) target Y = INDPRO level at the same dates
    Y_train = df.loc[X_train.index, "INDPRO"]
    # 5) OLS
    Xc = np.hstack([np.ones((len(X_train), 1)), X_train.values])
    beta = np.linalg.inv(Xc.T @ Xc) @ (Xc.T @ Y_train.values.reshape(-1, 1))
    # 6) regressor vector for t+1 (must match the column order of X_train:
    #    factor blocks first, then the INDPRO lags)
    # • factor blocks: F(t), F(t-1)…F(t-(m_IND-1))
    F_blocks = [factors_train.shift(l).iloc[-1].values
                for l in range(0, m_IND)]  # 0 = current
    # • INDPRO scalars: lag 1…p_IND (most recent value first)
    Y_scalars = [df.loc[df_train.index[-1-l], "INDPRO"]
                 for l in range(p_IND)]
    x_new = np.hstack([1.0, *F_blocks, *Y_scalars])
    y_hat = float((x_new @ beta).squeeze())  # scalar forecast, no warnings
    # 7) save forecast for the first test date
    next_date = df_test.index[0]
    fcst_dates.append(next_date)
    fcst_values.append(y_hat)
    # 8) roll-forward: move one observation from test to train
    df_train = pd.concat([df_train, df_test.iloc[[0]]])
    df_test = df_test.iloc[1:]
    # 9) progress bar
    step_count += 1
    if step_count % progress_mod == 0 or step_count == steps_total:
        elapsed = time.time() - tic
        eta_sec = (elapsed / step_count) * (steps_total - step_count)
        eta_str = time.strftime('%H:%M:%S', time.gmtime(eta_sec))
        print(f"[{step_count:>3}/{steps_total}] r*={r_opt:2d} ETA ≈ {eta_str}")
# OUT-OF-SAMPLE FORECAST SERIES
forecast_INDPRO_oos = pd.Series(fcst_values,
                                index=pd.to_datetime(fcst_dates),
                                name="INDPRO_forecast")
[ 16/160] r*= 7 ETA ≈ 00:02:40 [ 32/160] r*= 7 ETA ≈ 00:02:16 [ 48/160] r*= 7 ETA ≈ 00:02:00 [ 64/160] r*= 7 ETA ≈ 00:01:44 [ 80/160] r*= 7 ETA ≈ 00:01:33 [ 96/160] r*= 7 ETA ≈ 00:01:16 [112/160] r*= 7 ETA ≈ 00:01:07 [128/160] r*= 7 ETA ≈ 00:00:53 [144/160] r*= 7 ETA ≈ 00:00:29 [160/160] r*= 7 ETA ≈ 00:00:00
# Out-of-sample accuracy of the one-step INDPRO forecasts.
y_true = df.loc[forecast_INDPRO_oos.index, "INDPRO"].astype(float)
y_pred = forecast_INDPRO_oos.astype(float)
errors = y_true - y_pred
mse = float(np.mean(errors ** 2))
rmse = float(np.sqrt(mse))
print(f"MSE : {mse:.4f}")
print(f"RMSE : {rmse:.4f}")
MSE : 1.8641 RMSE : 1.3653
# Observed vs. one-step-ahead forecast paths over the evaluation window.
actual_IND = y_true  # observed out-of-sample values
pred_IND = forecast_INDPRO_oos.reindex(actual_IND.index)  # align on the same dates
# Plot
fig, ax = plt.subplots(figsize=(10,4), dpi=500)
ax.plot(actual_IND.index, actual_IND.values,
        label="INDPRO (observed)", color="tab:blue")
ax.plot(pred_IND.index, pred_IND.values,
        label="INDPRO (1-step forecast)", color="tab:orange", linestyle="-")
ax.set_title("INDPRO – one-step-ahead rolling forecasts vs. actual")
ax.set_ylabel("Index level")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
CPIAUCSL (First Difference)¶
- Lags of CPIAUCSL = $6$
- Lags of factors = $1$
# TARGET VARIABLE (Δ CPI)
df_diff = df.copy()  # df = levels, already in workspace
df_diff["diff_CPIAUCSL"] = df["CPIAUCSL"].diff()
df_diff = df_diff.iloc[1:]  # drop first NaN row created by the diff
# align indices: keep only dates common to the transformed panel and ΔCPI
common_idx = df_transformed.index.intersection(df_diff.index)
df_diff = df_diff.loc[common_idx].copy()
df_transf_diff = df_transformed.loc[common_idx].copy()  # now same dates
# PARAMETERS (same lag orders as the 6-month ΔCPI forecast block)
p_CPI = 6  # lags of ΔCPI to use as regressors
m_CPI = 1  # lags of factors
kmax_em = 15
tol_em = 1e-6
maxiter_em = 200
progress_mod = 16
# TRAIN / TEST SPLIT (roughly 80% / 20%)
df_train = df_transf_diff.loc[:'2011-11-01'].copy()
df_test = df_transf_diff.loc['2011-12-01':].copy()
fcst_dates, fcst_values = [], []
tic, steps_total, step_count = time.time(), len(df_test), 0
# ROLLING-WINDOW ONE-STEP-AHEAD – ΔCPI: expanding window, factors and OLS
# re-estimated each month using only information available at forecast time.
while not df_test.empty:
    # 1) NaNs → mean + standardize
    df_nan = df_train.fillna(df_train.mean())
    df_std = (df_nan - df_nan.mean()) / df_nan.std()
    # 2) extract factors via EM
    F_train, _, r_opt, *_ = em_factors_silent(
        df_transformed = df_train,
        df_missing = df_nan,
        df_standardized = df_std,
        kmax = kmax_em,
        tol = tol_em,
        max_iter = maxiter_em,
        print_every = 0
    )
    # 3) X = lag(1…m_CPI) of factors + lag(1…p_CPI) of ΔCPI
    lag_F = [F_train.shift(l).add_suffix(f"_lag{l}")
             for l in range(1, m_CPI + 1)]
    lag_Y = [df_diff["diff_CPIAUCSL"].shift(l).to_frame(f"diff_CPI_lag{l}")
             for l in range(1, p_CPI + 1)]
    X_train = pd.concat(lag_F + lag_Y, axis=1).dropna()
    # 4) target Y = aligned ΔCPI
    Y_train = df_diff.loc[X_train.index, "diff_CPIAUCSL"]
    # 5) OLS
    Xc = np.hstack([np.ones((len(X_train), 1)), X_train.values])
    beta = np.linalg.inv(Xc.T @ Xc) @ (Xc.T @ Y_train.values.reshape(-1, 1))
    # 6) regressor vector for t+1: current factors F(t) plus the p_CPI most
    #    recent ΔCPI values (order must match the columns of X_train)
    F_blocks = [F_train.shift(l).iloc[-1].values for l in range(0, m_CPI)]
    Y_scalars = [df_diff.loc[df_train.index[-1-l], "diff_CPIAUCSL"]
                 for l in range(p_CPI)]
    x_new = np.hstack([1.0, *F_blocks, *Y_scalars])
    y_hat = float((x_new @ beta).squeeze())  # forecast ΔCPI_{t+1}
    # 7) save forecast for the first test date
    fcst_dates.append(df_test.index[0])
    fcst_values.append(y_hat)
    # 8) roll-forward: move one observation from test to train
    df_train = pd.concat([df_train, df_test.iloc[[0]]])
    df_test = df_test.iloc[1:]
    # 9) progress bar
    step_count += 1
    if step_count % progress_mod == 0 or step_count == steps_total:
        elapsed = time.time() - tic
        eta = time.strftime('%H:%M:%S',
                            time.gmtime((elapsed/step_count)*(steps_total-step_count)))
        print(f"[{step_count:>3}/{steps_total}] r*={r_opt:2d} ETA ≈ {eta}")
# OOS FORECAST SERIES (ΔCPI)
forecast_CPI_oos = pd.Series(fcst_values,
                             index=pd.to_datetime(fcst_dates),
                             name="diff_CPIAUCSL_forecast")
[ 16/160] r*= 7 ETA ≈ 00:03:50 [ 32/160] r*= 7 ETA ≈ 00:02:56 [ 48/160] r*= 7 ETA ≈ 00:02:36 [ 64/160] r*= 7 ETA ≈ 00:02:15 [ 80/160] r*= 7 ETA ≈ 00:01:52 [ 96/160] r*= 7 ETA ≈ 00:01:29 [112/160] r*= 7 ETA ≈ 00:01:16 [128/160] r*= 7 ETA ≈ 00:00:57 [144/160] r*= 7 ETA ≈ 00:00:31 [160/160] r*= 7 ETA ≈ 00:00:00
# Out-of-sample accuracy of the one-step ΔCPI forecasts.
y_true_CPI = df_diff.loc[forecast_CPI_oos.index,
                         "diff_CPIAUCSL"].astype(float)
y_pred_CPI = forecast_CPI_oos.astype(float)
err_CPI = y_true_CPI - y_pred_CPI
mse_CPI = float(np.mean(err_CPI ** 2))
rmse_CPI = float(np.sqrt(mse_CPI))
print(f"MSE (CPI) : {mse_CPI:.6f}")
print(f"RMSE (CPI) : {rmse_CPI:.6f}")
MSE (CPI) : 0.360018 RMSE (CPI) : 0.600015
# Observed vs. one-step-ahead ΔCPI forecast paths over the evaluation window.
actual_CPI = y_true_CPI  # observed OOS values
pred_CPI = y_pred_CPI.reindex(actual_CPI.index)  # already same indices
fig, ax = plt.subplots(figsize=(10, 4), dpi=500)
ax.plot(actual_CPI.index, actual_CPI.values,
        label="CPI (observed)", color="tab:blue")
ax.plot(pred_CPI.index, pred_CPI.values,
        label="CPI (1-step forecast)", color="tab:orange")
ax.set_title("First Difference of CPI – one-step-ahead rolling forecasts vs. actual")
ax.set_ylabel("First difference CPI (pp)")
ax.legend()
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()